/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can be found in LICENSE.TXT    *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2005 by Myricom, Inc.  All rights reserved.                 *
 *************************************************************************/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>

#include "mx_auto_config.h"
#include "mxsmpi_wrap.h"
#include "mx_timing.h"
#include "mx__lib_types.h"
#include "mx__partner.h"
#include "mx__endpoint.h"


/* Number of leading ping-pong iterations that are executed and recorded
   but dropped by timings_process() before any statistic is computed. */
#define WARMUP 32

/*
 * qsort() comparator for arrays of double, ascending order.
 * Returns 1 when *ap is greater than *bp, 0 when they compare equal,
 * and -1 in every other case.
 */
static int
double_cmp(const void *ap, const void *bp)
{
  const double lhs = *(const double *)ap;
  const double rhs = *(const double *)bp;

  if (lhs > rhs)
    return 1;
  if (lhs == rhs)
    return 0;
  return -1;
}

/*
 * One series of per-iteration measurements.
 * The arrays are allocated by timings_alloc() and reduced to statistics
 * by timings_process().
 */
struct timings {
  /* Raw samples in cycle-counter units; timings_alloc() reserves
     iter + WARMUP + 1 entries. */
  mx_cycles_t *cc;
  /* iter samples converted by timings_process() (cycles *
     mx_seconds_per_cycle() * 1e6) and then sorted in ascending order. */
  double *sec;
  /* Mean of the iter post-warm-up samples, in the same unit as sec[]. */
  double avg;
};

/*
 * Allocate the zero-filled sample arrays of a timings series.
 *
 *   t    - series to initialise; t->sec and t->cc are overwritten.
 *   iter - number of measured (post-warm-up) iterations, must be >= 0.
 *
 * t->sec gets iter entries.  t->cc gets iter + WARMUP + 1 entries: the
 * warm-up samples plus one extra slot (main() uses it for the closing
 * timestamp of the round-trip series).
 *
 * Prints a message and exits with status 1 when iter is negative or an
 * allocation fails, so callers never see a NULL array they would index.
 */
void timings_alloc(struct timings *t, int iter)
{
  if (iter < 0) {
    fprintf(stderr, "timings_alloc: invalid iteration count %d\n", iter);
    exit(1);
  }

  t->sec = calloc(iter, sizeof(t->sec[0]));
  /* Do the size arithmetic in size_t so a large iter cannot overflow int. */
  t->cc = calloc((size_t)iter + WARMUP + 1, sizeof(t->cc[0]));

  /* calloc(0, ...) may legitimately return NULL, so only treat a NULL
     t->sec as a failure when entries were actually requested. */
  if (!t->cc || (iter > 0 && !t->sec)) {
    fprintf(stderr, "timings_alloc: out of memory (%d iterations)\n", iter);
    exit(1);
  }
}

/*
 * Reduce the raw cycle samples of a series to statistics.
 *
 *   t    - series whose cc[] holds iter + WARMUP (+1) raw samples.
 *   iter - number of samples to keep after the warm-up ones.
 *
 * On return the first WARMUP entries of t->cc have been discarded,
 * t->sec[0..iter-1] holds the converted samples sorted in ascending
 * order, and t->avg holds their mean.  Converted values are cycles
 * scaled by mx_seconds_per_cycle() * 1e6 (main() prints them as "us").
 */
void timings_process(struct timings *t, int iter)
{
  mx_cycles_t total = 0;
  int n;

  /* Slide the iter + 1 entries that follow the warm-up samples down to
     the start of the array; the regions overlap, hence memmove. */
  memmove(&t->cc[0], &t->cc[WARMUP], (iter + 1) * sizeof(t->cc[0]));

  n = 0;
  while (n < iter) {
    t->sec[n] = t->cc[n] * mx_seconds_per_cycle() * 1e6;
    total += t->cc[n];
    n++;
  }

  /* The mean is taken from the raw cycle total, before sec[] is sorted. */
  t->avg = total * mx_seconds_per_cycle() * 1e6 / iter;
  qsort(t->sec, iter, sizeof(t->sec[0]), double_cmp);
}

/*
 * Ping-pong CPU-usage benchmark between exactly two ranks.
 *
 * Usage: cpu_usage [length [iterations]]
 *   length     - message size in bytes, >= 0 (default 0)
 *   iterations - number of measured round trips, >= 1 (default 10);
 *                WARMUP extra round trips are run first and discarded.
 *
 * Rank 0 sends a message (match 0xaa00), polls mx_test() until the send
 * completes, posts a receive (match 0xaa11) and polls again.  Rank 1
 * mirrors this by receiving first and then sending.  The whole exchange
 * is run twice; the second pass overwrites the samples of the first.
 *
 * Rank 0 records, per iteration:
 *   isnd    - cycles spent inside mx_isend()
 *   sndact  - cycles spent in mx_test() calls that either completed the
 *             send or changed ep->eventq_index
 *   ircv    - cycles spent inside mx_irecv()
 *   rcvact  - same accounting as sndact, for the receive
 *   timings - timestamp at the start of the iteration (plus one closing
 *             timestamp), turned into round-trip times afterwards
 * and prints average, median, best and worst of each series, with the
 * round-trip time halved to give a one-way latency.
 *
 * Returns 0 on success, 1 on invalid arguments; exits with status 1 when
 * not run on two processes or when the message buffers cannot be
 * allocated.
 */
int main(int argc, char **argv)
{
  char * sbuf, *rbuf;
  int i,k;
  int nprocs, myrank;
  struct timings timings;
  struct timings isnd;
  struct timings sndact;
  struct timings ircv;
  struct timings rcvact;
  mx_segment_t sseg;
  mx_segment_t rseg;
  struct mx_endpoint *ep;
  struct mxsmpi_peer *peers;
  mx_status_t s;
  mx_request_t req;
  uint32_t done;
  size_t buflen;

  int length = argc > 1 ? atoi(argv[1]) : 0;
  int iter = argc > 2 ? atoi(argv[2]) : 10;

  /* The statistics index sec[iter-1] and divide by iter, and length is
     handed to malloc(), so reject values that would make those invalid. */
  if (length < 0 || iter < 1) {
    fprintf(stderr, "usage: %s [length >= 0] [iterations >= 1]\n",
            argc > 0 ? argv[0] : "cpu_usage");
    return 1;
  }

  timings_alloc(&timings,iter);
  timings_alloc(&isnd,iter);
  timings_alloc(&sndact,iter);
  timings_alloc(&ircv,iter);
  timings_alloc(&rcvact,iter);

  mx_cycles_counter_init();
  MPI_Init(&argc,&argv);
  MPI_Barrier(MPI_COMM_WORLD);
  ep = MPI_COMM_WORLD->ep;
  peers = MPI_COMM_WORLD->peers;
  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  if (nprocs != 2) {
    fprintf(stderr,"cpu_usage requires two nodes(I am %d out of %d)\n",
            myrank, nprocs);
    exit(1);
  } else {
    fprintf(stderr,"process %d out of %d started\n", myrank, nprocs);
  }

  /* malloc(0) may return NULL, which would look like a failure for the
     default zero-length message; always request at least one byte. */
  buflen = length > 0 ? (size_t)length : 1;
  sbuf = malloc(buflen);
  rbuf = malloc(buflen);
  if (!sbuf || !rbuf) {
    fprintf(stderr,"cannot allocate %d-byte message buffers\n", length);
    exit(1);
  }
  sseg.segment_length = length;
  sseg.segment_ptr = sbuf;
  rseg.segment_length = length;
  rseg.segment_ptr = rbuf;

  /* Two full passes; the samples kept are those of the second pass. */
  for (k=0;k< 2;k++) {
    if (myrank == 0) {

      for (i=0;i<iter+WARMUP;i++) {
        mx_cycles_t t1,t2;
        unsigned evt_idx;

        /* Start-of-iteration timestamp; also the cost of mx_isend(). */
        t1 = mx_get_cycles();
        timings.cc[i] = t1;
        mx_isend(ep, &sseg, 1, peers[1].addr, 0xaa00, NULL, &req);
        t2 = mx_get_cycles();
        isnd.cc[i] = t2 - t1;

        /* Poll for send completion, charging only the mx_test() calls
           that completed the request or moved the event queue index. */
        sndact.cc[i] = 0;
        evt_idx = ep->eventq_index;
        do {
          /* Nothing but mx_test() below is expected to move the index. */
          assert(evt_idx == ep->eventq_index);
          t1 = mx_get_cycles();
          mx_test(ep, &req, &s, &done);
          t2 = mx_get_cycles();
          if (done || ep->eventq_index != evt_idx) {
            sndact.cc[i] += t2 - t1;
            evt_idx = ep->eventq_index;
          }
        } while (!done);

        /* Cost of posting the receive for the reply. */
        t1 = mx_get_cycles();
        mx_irecv(ep, &rseg, 1, 0xaa11, MX_MATCH_MASK_NONE, NULL, &req);
        t2 = mx_get_cycles();
        ircv.cc[i] = t2 - t1;

        /* Same accounting as for the send, applied to the receive. */
        rcvact.cc[i] = 0;
        evt_idx = ep->eventq_index;
        do {
          assert(evt_idx == ep->eventq_index);
          t1 = mx_get_cycles();
          mx_test(ep, &req, &s, &done);
          t2 = mx_get_cycles();
          if (done || ep->eventq_index != evt_idx) {
            rcvact.cc[i] += t2 - t1;
            evt_idx = ep->eventq_index;
          }
        } while (!done);
      }
      /* Closing timestamp so the last iteration also has an end time. */
      timings.cc[i] = mx_get_cycles();
    } else {
      /* Echo side: receive the ping, then send the reply. */
      for (i=0;i<iter+WARMUP;i++) {
        mx_irecv(ep, &rseg, 1, 0xaa00, MX_MATCH_MASK_NONE, NULL, &req);
        do {
          mx_test(ep, &req, &s, &done);
        } while (!done);
        mx_isend(ep, &sseg, 1, peers[0].addr, 0xaa11, NULL, &req);
        do {
          mx_test(ep, &req, &s, &done);
        } while (!done);
      }
    }
  }

  if (myrank == 0) {

    /* Turn consecutive timestamps into per-iteration round-trip times. */
    for (i=0;i<iter+WARMUP;i++) {
      timings.cc[i] = timings.cc[i+1] - timings.cc[i];
    }

    timings_process(&timings, iter);
    /* Halve the round-trip figures to get one-way latencies.  This is
       the only place the halving happens, so the values are printed
       as-is below. */
    for (i=0;i<iter;i++) {
      timings.sec[i] /= 2;
    }
    timings.avg /= 2;

    timings_process(&isnd, iter);
    timings_process(&sndact, iter);
    timings_process(&ircv, iter);
    timings_process(&rcvact, iter);

    printf("\tisnd\tsndact\tircv\trcvact\tlat\n");
    printf("avg:\t%.3fus\t%.3fus\t%.3fus\t%.3fus\t%.3fus\n",
           isnd.avg, sndact.avg, ircv.avg, rcvact.avg, timings.avg);
    printf("median:\t%.3fus\t%.3fus\t%.3fus\t%.3fus\t%.3fus\n",
           isnd.sec[iter/2],sndact.sec[iter/2],ircv.sec[iter/2],rcvact.sec[iter/2],timings.sec[iter/2]);
    printf("best:\t%.3fus\t%.3fus\t%.3fus\t%.3fus\t%.3fus\n",
           isnd.sec[0],sndact.sec[0],ircv.sec[0],rcvact.sec[0],timings.sec[0]);
    printf("worst:\t%.3fus\t%.3fus\t%.3fus\t%.3fus\t%.3fus\n",
           isnd.sec[iter-1],sndact.sec[iter-1],ircv.sec[iter-1],rcvact.sec[iter-1],timings.sec[iter-1]);
  }
  fprintf(stderr,"end of program\n");
  MPI_Finalize();

  return 0;
}
